#!/usr/bin/python3
#
# MSX BASIC tokenizer
#
# Copyright © 2020 Pedro Gimeno Fortea
#
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.


import sys, re, io, struct
assert sys.hexversion >= 0x3050300, "Invalid Python version, must be 3.5.3+"

tokens1MSX = [
  "", "END", "FOR", "NEXT", "DATA", "INPUT", "DIM", "READ", "LET",
  "GOTO", "RUN", "IF", "RESTORE", "GOSUB", "RETURN", "REM", "STOP",
  "PRINT", "CLEAR", "LIST", "NEW", "ON", "WAIT", "DEF", "POKE", "CONT",
  "CSAVE", "CLOAD", "OUT", "LPRINT", "LLIST", "CLS", "WIDTH", "ELSE",
  "TRON", "TROFF", "SWAP", "ERASE", "ERROR", "RESUME", "DELETE",
  "AUTO", "RENUM", "DEFSTR", "DEFINT", "DEFSNG", "DEFDBL", "LINE",
  "OPEN", "FIELD", "GET", "PUT", "CLOSE", "LOAD", "MERGE", "FILES",
  "LSET", "RSET", "SAVE", "LFILES", "CIRCLE", "COLOR", "DRAW", "PAINT",
  "BEEP", "PLAY", "PSET", "PRESET", "SOUND", "SCREEN", "VPOKE",
  "SPRITE", "VDP", "BASE", "CALL", "TIME", "KEY", "MAX", "MOTOR",
  "BLOAD", "BSAVE", "DSKO$", "SET", "NAME", "KILL", "IPL", "COPY", "CMD",
  "LOCATE", "TO", "THEN", "TAB(", "STEP", "USR", "FN", "SPC(", "NOT",
  "ERL", "ERR", "STRING$", "USING", "INSTR", "'", "VARPTR", "CSRLIN",
  "ATTR$", "DSKI$", "OFF", "INKEY$", "POINT", ">", "=", "<", "+", "-", "*",
  "/", "^", "AND", "OR", "XOR", "EQV", "IMP", "MOD", "\\",  "", "", "",
]

tokens2 = [
  "", "LEFT$", "RIGHT$", "MID$", "SGN", "INT", "ABS", "SQR", "RND", "SIN",
  "LOG", "EXP", "COS", "TAN", "ATN", "FRE", "INP", "POS", "LEN", "STR$",
  "VAL", "ASC", "CHR$", "PEEK", "VPEEK", "SPACE$", "OCT$", "HEX$",
  "LPOS", "BIN$", "CINT", "CSNG", "CDBL", "FIX", "STICK", "STRIG", "PDL",
  "PAD", "DSKF", "FPOS", "CVI", "CVS", "CVD", "EOF", "LOC", "LOF", "MKI$",
  "MKS$", "MKD$", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
  "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
  "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
  "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
]

trans = (
  '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F'
  '\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F'
  ' !"#$%&\'()*+,-./0123456789:;<=>?'
  '@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_'
  '`abcdefghijklmnopqrstuvwxyz{|}~\x7F'
  'ÇüéâäàåçêëèïîìÄÅÉæÆôöòûùÿÖÜ¢£¥₧ƒ'
  'áíóúñÑªº¿⌐¬½¼¡«»ÃãĨĩÕõŨũĲĳ¾∽◊‰¶§'
  '▂▚▆🮂▬🮅▎▞▊🮇🮊🮘🮙🭭🭯🭬🭮🮚🮛▘▗▝▖🮖Δ‡ω█▄▌▐▀'
  'αßΓπΣσµτΦΘΩδ∞φε∩≡±≥≤⌠⌡÷≈°∙·√ⁿ²■█'
)

# Token parsing modes; these affect how integer literals are tokenized
ModeFloat = 0
ModeUint = 1
ModeLiteral = 2
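
# For example (illustrative values, derived from num2bytes() below):
#   b'10' tokenized in ModeFloat yields b'\x0F\x0A'     (1-byte integer)
#   b'10' tokenized in ModeUint  yields b'\x0E\x0A\x00' (line-number operand)
#   ModeLiteral suppresses number tokenization (used inside identifiers
#   such as A1).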

# Create charset translation tables from the above
msx_s2b = {}
msx_b2s = {}

for k, v in enumerate(trans):
  b = bytes((k,))
  msx_s2b[v] = b
  msx_b2s[b] = v

def encode(s):
  return b''.join([msx_s2b[c] for c in s])
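
# Usage sketch: encode() maps Unicode text to MSX charset bytes via the
# table above, e.g. encode('é') == b'\x82' and encode('A') == b'A'.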

# Exception for the BASIC errors
class BasicError(Exception):
  pass


def num2bytes(s, tok_mode):
  """Translate a decimal number string into its tokenized byte form"""

  assert tok_mode != ModeLiteral
  assert b'e' not in s, "Lower-case exponent in number"

  # Remove embedded spaces and leading zeros
  s = s.replace(b' ', b'').lstrip(b'0')

  # Unsigned int is simple; s may be empty here if the number was all zeros
  if tok_mode == ModeUint:
    return struct.pack('<BH', 0x0E, int(s) if s else 0)

  # Find suffix
  suffix = s[-1:]
  if suffix in {b'%', b'!', b'#'}:
    s = s[:-1]
  else:
    suffix = b''

  # Remove point and exponent after taking note of their position and value
  point = s.find(b'.')
  s = s.replace(b'.', b'')
  ep = s.upper().find(b'E')  # exponent pointer

  exp = 0  # exponent
  has_exp = ep != -1
  if not has_exp:
    ep = len(s)
  else:
    tmp = s[ep + 1:]
    s = s[:ep]
    if tmp not in {b'', b'+', b'-'}:
      exp = int(tmp, 10)

  has_point = point != -1
  if point == -1:
    point = ep
  exp += point + 0x40  # apply bias

  # Remove zeros after the decimal point and subtract the count from exp
  orig_len = len(s)
  s = s.lstrip(b'0')
  exp -= orig_len - len(s)
  point -= orig_len - len(s)
  # Pad with zeros to the right
  s += b'0' * (14 - len(s))

  # Calculate the integer value for the case of % or no point/exp
  nint = 0
  if point > 0:
    # Perform rounding of the 14th decimal
    nint = int(s[:point])
    if point <= 14 and (s[point:14] == b'9' * (14 - point)
                        and s[14:15] >= b'5'):
      nint += 1

  if suffix == b'%' or (not has_point and not has_exp and suffix == b''
                        and 0 <= nint <= 32767):
    if not (0 <= nint <= 32767):
      raise BasicError("Overflow")
    if nint < 10:
      return struct.pack('<B', 0x11 + nint)
    if nint < 256:
      return struct.pack('<BB', 0x0F, nint)
    return struct.pack('<Bh', 0x1C, nint)

  n = int(s[:14])
  if suffix == b'!' or (n % 100000000 == 0 and suffix != b'#'):
    # Handle as single-precision
    n = (n + 50000000) // 100000000

    if n > 999999:
      # Replicate a bug in MS's tokenizer where 9.999995! is tokenized as 1!
      # (uncomment to fix the bug)
      #exp += 1
      n //= 10
    assert 0 <= n <= 999999
    if n == 0: exp = 0
    if exp > 127 or exp < 0:
      raise BasicError("Overflow")
    return struct.pack('>BL', 0x1D, int(str(n), 16) + exp * 0x1000000)

  if s[14:15] >= b'5':
    n += 1
  if n > 99999999999999:
    exp += 1
    n //= 10
  assert 0 <= n <= 99999999999999
  if n == 0: exp = 0
  if exp > 127 or exp < 0:
    raise BasicError("Overflow")
  return struct.pack('>BLL', 0x1F,
    int(str(n // 100000000), 16) + exp * 0x1000000,
    int(str(n % 100000000), 16))
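
# Illustrative encodings, derived from the rules above (not verified against
# a real machine):
#   num2bytes(b'5', ModeFloat)    -> b'\x16'                  (0x11 + 5)
#   num2bytes(b'1000', ModeFloat) -> b'\x1C\xE8\x03'          (2-byte int)
#   num2bytes(b'1.5', ModeFloat)  -> b'\x1D\x41\x15\x00\x00'  (single: biased
#                                    exponent 0x41, BCD mantissa .15)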


def tokenize(src, use_cr=False, type_mode=False, remove_spaces=False):
  """Options:
  use_cr:        True to split lines at CR and ignore LF (like the real thing),
                 False to split lines at LF and ignore CR (multiplatform).
  type_mode:     True to behave as if the program were typed in; False to
                 behave like LOAD. When True:
                   - Spaces at EOL are removed.
                   - Entering line numbers >= 65530 raises Syntax error.
  remove_spaces: True to "minify" the lines by removing all spaces between
                 tokens.
  """

  src = encode(src)

  LeadingASCII = re.compile(b'^[A-Z]+')
  tok_translate = {}
  for k, v in enumerate(tokens1MSX):
    v = encode(v)
    assert k < 128 and v not in tok_translate
    if v != b'':
      tok_translate[v] = bytes((k+128,))

  for k, v in enumerate(tokens2):
    v = encode(v)
    assert k < 128 and v not in tok_translate
    if v != b'':
      tok_translate[v] = bytes((255,k+128))

  # Special encoding of ' as :REM'
  tok_translate[b"'"] = b':' + tok_translate[b'REM'] + tok_translate[b"'"]
  # ? is parsed as PRINT
  tok_translate[b'?'] = tok_translate[b'PRINT']
  # GOTO admits the spelling GO TO but not e.g. GO  TO. GOSUB not affected.
  tok_translate[b'GO TO'] = tok_translate[b'GOTO']
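  # For example, tok_translate[b'PRINT'] is b'\x91', and the entry for b"'"
  # expands to the three bytes b':\x8F\xE6' (colon, REM token, "'" token).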

  # Create list of keywords sorted by longest, escaped
  L = []
  for v in tokens1MSX + tokens2 + ['GO TO']:
    v = encode(v)
    if LeadingASCII.search(v):
      L.append(v)

  L.sort(key=len, reverse=True)  # longest match first
  # Escape for RE
  L = [re.escape(v) for v in L]

  # Tokenizer for decimal numbers.
  decimal = (
      br'|(?P<dec>(?:'
        # Cautiously avoid matching 1 space alone or 1 trailing space.
        br'(?:[0-9][0-9\ ]*)?\.(?:[0-9\ ]*[0-9])?' # at least a dot
        br'|(?:[0-9](?:[0-9\ ]*[0-9])?)'           # or at least a digit
      br')'                                        # not optional
      br'(?:'
        br'\ *[%!\#]'                              # suffix
        br'|\ *E\ *[-+]?(?:[0-9\ ]*[0-9])?'        # or exponent, but not both
      br')?)'
  )
  # Tokenizer for uint. Up to 6552, read up to 5 digits; from 6553 on, up to 4.
  # Used for line numbers, either leading or after GOTO/GOSUB/etc.
  uint = (br'|(?P<dec>'
    br'(?:0[0\ ]*)?'                             # leading zeros prefix
    br'(?:0'                                     # zero
    br'|[1-5](?:\ *[0-9]){,4}'                   # prefix 1..5999, 5 digits
    br'|6\ *[0-4](?:\ *[0-9]){,3}'               # prefix 6000..6499, 5 digits
    br'|6\ *5\ *[0-4](?:\ *[0-9]){,2}'           # prefix 6500..6549, 5 digits
    br'|6\ *5\ *5\ *[0-2](?:\ *[0-9])?'          # prefix 6550..6552, 5 digits
    br'|[6-9](?:\ *[0-9]){,3}'                   # rest, 4 digits
    br'))'
  )
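  # For example, b'65529' matches in full, while for b'65530' the match stops
  # at b'6553', leaving the final b'0' to be handled separately (see the
  # type_mode check below).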

  tokenizer = (br'(?:'
    br"(?P<rem>(?:REM|').*)"                  # comment
    br'|(?P<data>(?:DATA)(?:"[^"]*(?:"|$)|[^:])*)'  # data
    br'|(?P<call>(?:CALL|_)[^:(]*)'           # call
    br'|(?P<kw>' + br'|'.join(L) + br')'      # keywords
    br'|[A-Z](?:\ *[0-9.])*'                  # identifier
    br'|&H(?P<hex>[0-9A-F]*)'                 # hex number
    br'|&O(?P<oct>[0-7]*)'                    # octal number
    #br'|&B(?P<bin>[01]+)'  # binary numbers don't have tokens
    br'%s'                                    # decimal number
    br'|(?P<str>"(?:[^"]*)(?:"|$))'           # string literal
     b'|(?P<del>[\x80-\xFF])'                 # remove these
    br'|.'
    br')'
    )

  tokenizer_uint = re.compile(tokenizer % uint, re.I)
  tokenizer_float = re.compile(tokenizer % decimal, re.I)
  tokenizer_literal = re.compile(tokenizer % b'|(?P<dec>(?!))', re.I)

  del L, uint, decimal, LeadingASCII, tokenizer

  # Control character behaviour:
  # \x00: Terminates the line prematurely (ignores up to the next \x0D).
  # \x01-\x08: Skipped, together with the token that follows.
  # \x09: Inserted verbatim; finishes the current token.
  # \x0A: Ignored (skipped).
  # \x0B-\x0C: Replaced by spaces.
  # \x0D: EOL.
  # \x0E-\x0F, \x11-\x19, \x1B-\x1F: Inserted verbatim.
  # \x10: Undefined effects. Two effects we observed:
  #       - Hang the machine.
  #       - Delete the last byte and insert the current line's start address.
  # \x1A: EOF. Stops further processing.
  # \x7F: Unknown.

  leadingdigit = re.compile(br'[\ \x01-\x1F\x80-\xFF]*(?:([0-9]?))')
  trunc_at_null = re.compile(br'\x00.*', re.S)
  call_strip = re.compile(br'[^0-\x7F\ (]*')

  # Compile the BASIC to a buffer
  buf = io.BytesIO()

  # Truncate source at \x1A (^Z)
  src = re.sub(b'\x1A.*', b'', src, flags=re.S)
  if use_cr:
    # realistic
    src = src.split(b'\r')
    ignore = b'\n'
  else:
    # practical
    src = src.split(b'\n')
    ignore = b'\r'

  filestart = buf.tell()

  # First pass: Read the lines and tokenize them into a dict with the line
  # number as the key. Handle line deletion, overwriting, etc.
  PRGLines = {}
  for line in src:
    line = trunc_at_null.sub(b'', line.replace(ignore, b'').lstrip(b' '))
    if type_mode: line = line.rstrip(b' ')
    if line == b'':
      continue

    g = tokenizer_uint.match(line)
    # Error if not numeric
    if not g.group('dec'):
      # we can't execute BASIC statements even in typing mode
      raise BasicError("Direct statement in file")

    p = g.end()
    linenum = int(g.group().replace(b' ', b''))
    if linenum != 0:
      # The first space is not removed for line 0
      if line[p:p+1] == b' ':
        p += 1

    g = leadingdigit.match(line, p)
    if type_mode:
      # Entering e.g.
      #   65530
      # alone raises Syntax error. In LOAD mode, it enters 6553 0
      if g.group(1): raise BasicError("Syntax error")
    if not g.group(1) and g.end(0) == len(line):
      # Delete line
      if linenum not in PRGLines:
        raise BasicError("Undefined line number")
      del PRGLines[linenum]
      continue

    lbuf = io.BytesIO()
    tok_mode = ModeFloat

    while True:
      nextch = line[p:p+1]
      if tok_mode == ModeUint and (nextch == b':'
                                   or b'A' <= nextch <= b'Z'
                                   or b'a' <= nextch <= b'z'):
        tok_mode = ModeFloat

      if p == len(line): break

      tokenizer = tokenizer_float
      if tok_mode == ModeUint:
        tokenizer = tokenizer_uint
      elif tok_mode == ModeLiteral:
        tokenizer = tokenizer_literal
      g = tokenizer.match(line, p)

      assert g, "No match in tokenizer for " + repr(line[p:p+1])
      match = g.group().upper()
      p = g.end()

      # Handle control chars... somewhat
      if match in {b'\x01', b'\x02', b'\x03', b'\x04', b'\x05', b'\x06',
                   b'\x07', b'\x08'}:
        # Eat a token, if one remains on the line
        g = tokenizer.match(line, p)
        if g:
          p = g.end()
        continue

      if tok_mode != ModeUint and match in {b'\x0B', b'\x0C', b'\x0E',
          b'\x0F', b'\x10', b'\x11', b'\x12', b'\x13', b'\x14', b'\x15',
          b'\x16', b'\x17', b'\x18', b'\x19', b'\x1B', b'\x1C', b'\x1D',
          b'\x1E', b'\x1F'}:
        lbuf.write(b' ')
        tok_mode = ModeFloat
        continue

      # Handle token types
      if g.group('rem'):
        if g.group().startswith(b"'"):
          lbuf.write(tok_translate[b"'"])
          lbuf.write(g.group()[1:])
        else:
          lbuf.write(tok_translate[b'REM'])
          lbuf.write(g.group()[3:])
      elif g.group('data'):
        lbuf.write(tok_translate[b'DATA'])
        lbuf.write(g.group()[4:])
      elif g.group('call'):
        match = call_strip.sub(b'', match)
        if match.startswith(b'_'):
          lbuf.write(match)
        else:  # starts with 'CALL'
          lbuf.write(tok_translate[b'CALL'])
          lbuf.write(match[4:])
      elif g.group('kw'):
        # keyword
        if match in {b'GOTO', b'GO TO', b'GOSUB', b'THEN', b'ELSE',
            b'LIST', b'DELETE', b'AUTO', b'LLIST', b'RENUM', b'RESUME'}:
          tok_mode = ModeUint
        lbuf.write(tok_translate[match])
      elif g.group('hex') is not None:
        # hex literal
        lbuf.write(b'\x0C')
        num = int(g.group('hex'), 16) if g.group('hex') != b'' else 0
        if num > 65535:
          raise BasicError("Overflow")
        lbuf.write(struct.pack('<H', num))
      elif g.group('oct') is not None:
        # oct literal
        lbuf.write(b'\x0B')
        num = int(g.group('oct'), 8) if g.group('oct') != b'' else 0
        if num > 65535:
          raise BasicError("Overflow")
        lbuf.write(struct.pack('<H', num))
      elif g.group('dec'):
        # dec literal
        lbuf.write(num2bytes(match, tok_mode))
      elif g.group('str'):
        # string
        lbuf.write(g.group())
      elif g.group('del'):
        # characters > 127 aren't written
        pass
      else:
        if len(match) == 1 and (b'A' <= match <= b'Z' or b'0' <= match <= b'9'):
          tok_mode = ModeLiteral  # for identifiers like A1
        elif tok_mode == ModeLiteral:
          tok_mode = ModeFloat    # revert to float mode otherwise
        if remove_spaces and match == b' ':
          continue
        if match in tok_translate:
          # some symbols need to be translated to tokens for some reason
          lbuf.write(tok_translate[match])
        else:
          lbuf.write(match)
    lbuf.write(b'\0')

    PRGLines[linenum] = lbuf.getvalue()


  # Second pass: write the surviving lines in ascending line-number order.
  # Each line is stored as <2-byte link to the next line> <2-byte line number>
  # <tokenized body ending in 0x00>; a null link terminates the program.
  addr = 0x8001  # MSX BASIC program text starts at address 0x8001
  for linenum in sorted(PRGLines.keys()):
    line = PRGLines[linenum]
    addr += len(line) + 4
    buf.write(struct.pack('<HH', addr, linenum))
    buf.write(line)

  buf.write(b'\0\0')
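
  # For example, the one-line program b'10 CLS' yields:
  #   07 80  0A 00  9F 00   (link to 0x8007, line 10, CLS token, terminator)
  #   00 00                 (end-of-program link)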

  return buf.getvalue()
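
# Usage sketch (hypothetical program text; mirrors what main() writes):
#   data = tokenize('10 PRINT "HI"\n20 GOTO 10\n')
#   with open('OUT.BAS', 'wb') as f:
#       f.write(b'\xFF' + data)  # a leading \xFF marks a tokenized BASIC file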


def main():
  with open(sys.argv[1], 'r', newline='') as f:
    lines = f.read()
  out = tokenize(lines, use_cr=len(sys.argv) > 3 and sys.argv[3] == '1')
  with open(sys.argv[2], 'wb') as f:
    f.write(b'\xFF' + out)


if __name__ == '__main__':
  main()
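
# Command-line usage sketch ("tokenizer.py" stands for this script's name):
#   python3 tokenizer.py PROGRAM.ASC PROGRAM.BAS [1]
# Passing '1' as the optional third argument selects CR line splitting
# (use_cr=True).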
